#!/usr/bin/env python3
# vim:fileencoding=utf-8

import os
import sys
import subprocess
import urllib.request
import urllib.parse
import dbm.gnu
from lxml.html import fromstring

# Cache mapping an image URL (bytes) to the text typed in for that image.
# Stays None until main() opens the dbm.gnu database and assigns it here.
imgdb = None
# Scratch file each downloaded image is written to before it is displayed.
tmpfile = '/tmp/tempimagefile'

def main(url):
  """Download the novel whose index page is at *url* into a text file.

  The index page is decoded as GB18030 and parsed for the title, the
  author and the chapter links.  The result is written to
  ``<title>_<author>.txt`` in the current directory: a header with the
  title and author, then every chapter heading followed by the chapter
  body produced by onepage().

  The module-level ``imgdb`` cache is opened here so that gettext4img()
  can use it, and is closed again before returning, even on error.
  """
  global imgdb
  # 'c': open read/write, creating the database if needed; 's': synchronized mode.
  imgdb = dbm.gnu.open(os.path.expanduser('~/scripts/python/pydata/xs8.db'), 'cs')
  try:
    # Close the HTTP response as soon as the body has been read.
    with urllib.request.urlopen(url) as resp:
      page = resp.read()
    doc = fromstring(page.decode('gb18030'))
    # Chapter hrefs may be relative; resolve them against the index URL.
    doc.make_links_absolute(url)
    title = doc.cssselect('.covertt > .ctt > i')[0].text
    author = doc.cssselect('.covertt > a')[0].text
    links = doc.cssselect('#zjlist li > a')
    print('下载：《%s》（%s），共 %d 章。' % (title, author, len(links)))
    # Explicit UTF-8 so the Chinese text can be written whatever the locale's
    # default encoding is.
    with open(title + '_' + author + '.txt', 'w', encoding='utf-8') as f:
      print(title, author, sep='\n', end='\n\n', file=f)
      for i, l in enumerate(links):
        print(l.text, end='\n\n', file=f)
        # Progress line on stdout.
        print('%d. %s' % (i, l.text))
        onepage(l.get('href'), f)
  finally:
    # Release the database handle even when a download or parse step fails.
    imgdb.close()

def onepage(url, f):
  """Fetch one chapter page and append its text to the open file *f*.

  The page at *url* is decoded as GB18030 and its ``#zoom`` element is
  taken as the chapter body.  The element's leading text is written
  first; then, for each direct child:

  * ``<br>``: the text following it is written with leading spaces removed;
  * ``<img>``: the image is replaced by the text returned from
    gettext4img(), followed by any text after the image.

  Children with any other tag are skipped together with their trailing
  text.  Two newlines end the chapter.
  """
  # Close the HTTP response as soon as the body has been read.
  with urllib.request.urlopen(url) as resp:
    page = resp.read()
  doc = fromstring(page.decode('gb18030'))
  content = doc.cssselect('#zoom')[0]
  # .text is None when #zoom starts directly with a child element
  # (e.g. <br> or <img>); treat that as empty text instead of crashing.
  f.write((content.text or '').strip('\n ') + '\n')
  for el in content:
    if el.tag == 'br':
      if el.tail:
        f.write(el.tail.lstrip(' '))
    elif el.tag == 'img':
      # The src may be relative to the chapter page.
      d = gettext4img(urllib.parse.urljoin(url, el.get('src')))
      if el.tail:
        d += el.tail
      f.write(d)
  f.write('\n\n')

def gettext4img(url):
  """Return the text that the image at *url* stands for.

  The answer is looked up in the module-level ``imgdb`` cache, keyed by
  the encoded URL.  On a miss (or when the stored answer is empty) the
  image is downloaded to ``tmpfile``, shown with ``sxiv``, and the user
  is asked to type the text; the stripped answer is stored in the cache
  and returned.

  The scratch file is removed afterwards, including when the viewer
  cannot be started or the prompt is interrupted.
  """
  burl = url.encode()
  cached = imgdb.get(burl, None)
  if cached:
    return cached.decode()

  # Close the HTTP response as soon as the body has been read.
  with urllib.request.urlopen(url) as resp:
    image = resp.read()
  try:
    with open(tmpfile, 'wb') as f:
      f.write(image)
    subprocess.call(['sxiv', tmpfile])
    ch = input('> ').strip()
  finally:
    # Clean up even on Ctrl-C / EOF at the prompt or a missing viewer.
    try:
      os.unlink(tmpfile)
    except FileNotFoundError:
      # The file was never created (open() itself failed).
      pass
  imgdb[burl] = ch.encode()
  return ch

if __name__ == '__main__':
  # Script entry point: exactly one command-line argument, the index page
  # URL, is expected; anything else aborts with a message.
  args = sys.argv[1:]
  if len(args) == 1:
    main(args[0])
  else:
    sys.exit('A URL should be given.')

